{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ab8a5e69-65f6-4bc4-be94-14fc6ef17251",
   "metadata": {
    "id": "ab8a5e69-65f6-4bc4-be94-14fc6ef17251",
    "tags": []
   },
   "source": [
    "# 1. Environment Setup\n",
    "\n",
    "## Install DashInfer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e132c0e5-3b5b-4200-b3eb-7a0bae419c57",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "id": "e132c0e5-3b5b-4200-b3eb-7a0bae419c57",
    "outputId": "28a0d5bb-53a7-4461-ea71-0d4b3e35e3d1",
    "tags": []
   },
   "outputs": [],
   "source": [
    "# install dashinfer\n",
    "!pip install dashinfer\n",
    "\n",
    "# uninstall tensorflow (if present) to avoid conflict\n",
    "!pip uninstall tensorflow -y"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "daC59uCToA9t",
   "metadata": {
    "id": "daC59uCToA9t"
   },
   "source": [
    "## Prepare Qwen Model Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1udCshW1oMr3",
   "metadata": {
    "id": "1udCshW1oMr3",
    "outputId": "9e891e5e-a8e0-4a05-9a34-d0d0c1cbfe1a",
    "tags": []
   },
   "outputs": [],
   "source": [
    "# install Qwen dependencies\n",
    "!pip install sentencepiece accelerate transformers_stream_generator tiktoken"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07864f6e-8d70-419b-8c7d-e420b8f3ebe0",
   "metadata": {
    "id": "07864f6e-8d70-419b-8c7d-e420b8f3ebe0",
    "tags": []
   },
   "source": [
    "# 2. Model Download and Chat-Template Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "daa6dab8-cf5a-4215-93c3-326018689b2e",
   "metadata": {
    "id": "daa6dab8-cf5a-4215-93c3-326018689b2e",
    "tags": []
   },
   "source": [
    "## 2.1 Model Download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "170ac1fe-b4f3-4e57-befa-52bedc94dcbe",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "id": "170ac1fe-b4f3-4e57-befa-52bedc94dcbe",
    "tags": []
   },
   "outputs": [],
   "source": [
    "## download model from modelscope\n",
    "!pip install modelscope\n",
    "from modelscope import snapshot_download\n",
    "torch_model_dir = snapshot_download(\"qwen/Qwen-1_8B-Chat\", revision=\"v1.0.0\")\n",
    "\n",
    "## download model from huggingface\n",
    "# !pip install huggingface_hub einops\n",
    "# from huggingface_hub import snapshot_download\n",
    "# torch_model_dir = snapshot_download(repo_id=\"Qwen/Qwen-1_8B-Chat\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9587b317-af79-4352-a73e-b603275d3501",
   "metadata": {
    "id": "9587b317-af79-4352-a73e-b603275d3501",
    "tags": []
   },
   "source": [
    "## 2.2 Chat Template Composition\n",
    "\n",
    "Use jinja template to format chat template for Qwen model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a002c918-d639-409e-9ab9-14a112bcde26",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2024-05-13T02:46:20.624317Z",
     "iopub.status.busy": "2024-05-13T02:46:20.623993Z",
     "iopub.status.idle": "2024-05-13T02:46:20.644530Z",
     "shell.execute_reply": "2024-05-13T02:46:20.644101Z",
     "shell.execute_reply.started": "2024-05-13T02:46:20.624299Z"
    },
    "id": "a002c918-d639-409e-9ab9-14a112bcde26",
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import copy\n",
    "import time\n",
    "import random\n",
    "from jinja2 import Template\n",
    "\n",
    "\n",
    "def apply_chatml_template(inputs, default_gen_cfg=None):\n",
    "    start_text = \"<|im_start|>\"\n",
    "    end_text = \"<|im_end|>\"\n",
    "    system_msg = {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}\n",
    "    user_msg = {\"role\": \"user\", \"content\": \"\"}\n",
    "    assistant_msg = {\"role\": \"assistant\", \"content\": \"\"}\n",
    "\n",
    "    prompt_template = Template(\n",
    "        \"{{start_text}}\" + \"{{system_role}}\\n\" + \"{{system_content}}\" + \"{{end_text}}\\n\" +\n",
    "        \"{{start_text}}\" + \"{{user_role}}\\n\" + \"{{user_content}}\" + \"{{end_text}}\\n\" +\n",
    "        \"{{start_text}}\" + \"{{assistant_role}}\\n\\n\")\n",
    "\n",
    "    gen_cfg_list = []\n",
    "    user_msg[\"content\"] = copy.deepcopy(inputs)\n",
    "\n",
    "    prompt = prompt_template.render(start_text=start_text, end_text=end_text,\n",
    "                                    system_role=system_msg[\"role\"], system_content=system_msg[\"content\"],\n",
    "                                    user_role=user_msg[\"role\"], user_content=user_msg[\"content\"],\n",
    "                                    assistant_role=assistant_msg[\"role\"])\n",
    "\n",
    "    if default_gen_cfg != None:\n",
    "        gen_cfg = copy.deepcopy(default_gen_cfg)\n",
    "        gen_cfg[\"seed\"] = random.randint(0, 10000)\n",
    "        gen_cfg_list.append(gen_cfg)\n",
    "\n",
    "    return [prompt], gen_cfg_list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27fba360-b05b-4fcd-bed9-84762c664aba",
   "metadata": {
    "id": "27fba360-b05b-4fcd-bed9-84762c664aba",
    "tags": []
   },
   "source": [
    "#  3. DashInfer Setup: engine initialization and model preparation\n",
    "\n",
    "- Convert downloaded model to be loaded by DashInfer.\n",
    "\n",
    "- Warm-Up: DashInfer warms up the engine with random tokens, up to the `max_length` specified in configuration. This allocates necessary resources to facilitate fast inference, for request(s) of length up to the `max_length`.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a5d478a2-366d-4479-8ae0-86d8139b1813",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2024-05-13T02:46:20.645367Z",
     "iopub.status.busy": "2024-05-13T02:46:20.645084Z",
     "iopub.status.idle": "2024-05-13T02:47:34.650743Z",
     "shell.execute_reply": "2024-05-13T02:47:34.650218Z",
     "shell.execute_reply.started": "2024-05-13T02:46:20.645350Z"
    },
    "id": "a5d478a2-366d-4479-8ae0-86d8139b1813",
    "outputId": "5eb7475f-b764-4770-e61e-e52cc7de0640",
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: Logging before InitGoogleLogging() is written to STDERR\n",
      "I20240513 10:46:20.667255   732 thread_pool.h:46] ThreadPool created with: 1\n",
      "I20240513 10:46:20.667313   732 as_engine.cpp:232] AllSpark Init with Version: 1.0.4/(GitSha1:0549ab25-dirty)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "### convert_config: {'do_dynamic_quantize_convert': False}\n",
      "### engine_config: {'engine_max_length': 2048, 'engine_max_batch': 8, 'do_profiling': False, 'num_threads': 0, 'matmul_precision': 'medium'}\n",
      "\n",
      "No such file or directory: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.dimodel\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "E20240513 10:46:21.152429   732 as_engine.cpp:942] workers is empty\n",
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.51s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trans model from huggingface model: /mnt/workspace/.cache/modelscope/qwen/Qwen-1_8B-Chat\n",
      "Dashinfer model will save to  ./dashinfer_models/\n",
      "### model_config: {'vocab_size': 151936, 'hidden_size': 2048, 'intermediate_size': 11008, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'emb_dropout_prob': 0.0, 'attn_dropout_prob': 0.0, 'layer_norm_epsilon': 1e-06, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'max_position_embeddings': 8192, 'bf16': False, 'fp16': False, 'fp32': True, 'kv_channels': 128, 'rotary_pct': 1.0, 'rotary_emb_base': 10000, 'use_dynamic_ntk': True, 'use_logn_attn': True, 'use_flash_attn': False, 'no_bias': True, 'use_cache_quantization': False, 'use_cache_kernel': False, 'softmax_in_fp32': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['QWenLMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': 'QWenTokenizer', 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/mnt/workspace/.cache/modelscope/qwen/Qwen-1_8B-Chat', '_commit_hash': None, '_attn_implementation_internal': None, 'transformers_version': '4.32.0', 'auto_map': {'AutoConfig': 'configuration_qwen.QWenConfig', 'AutoModelForCausalLM': 'modeling_qwen.QWenLMHeadModel'}, 'model_type': 'qwen', 'onnx_safe': None, 'seq_length': 8192, 'size_per_head': 128}\n",
      "save dimodel to  ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.dimodel\n",
      "save ditensors to  ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.ditensors\n",
      "parse weight time:  8.782530069351196\n",
      "current allspark version major[ 1 ] minor[ 0 ] patch[ 4 ] commit =  0549ab25\n",
      "calculate md5 of dimodel =  2d5d46b154e9f0c8bd51274775675eea\n",
      "torch build meta: \t model_name \t:  Qwen-1_8B-Chat_cpu_single_float32\n",
      "torch build meta: \t model_type \t:  Qwen_v10\n",
      "torch build meta: \t save_dir \t:  ./dashinfer_models/\n",
      "torch build meta: \t multinode_mode \t:  False\n",
      "torch build meta: \t data_type \t:  float32\n",
      "torch build meta: \t do_dynamic_quantize_convert \t:  False\n",
      "torch build meta: \t use_dynamic_ntk \t:  True\n",
      "torch build meta: \t use_logn_attn \t:  True\n",
      "torch build meta: \t model_sequence_length \t:  2048\n",
      "torch build meta: \t seqlen_extrapolation \t:  1.0\n",
      "torch build meta: \t rotary_base \t:  10000\n",
      "serialize_model_from_torch: save model = true, time :  8.86939525604248\n",
      "convert model from HF finished, build time is 8.86966061592102 seconds\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I20240513 10:46:39.666299   732 as_engine.cpp:384] Build model use following config:\n",
      "AsModelConfig :\n",
      "\tmodel_name: Qwen-1_8B-Chat_cpu_single_float32\n",
      "\tmodel_path: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.dimodel\n",
      "\tweights_path: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.ditensors\n",
      "\tcompute_unit: CPU:0\n",
      "\tnum_threads: 32\n",
      "\tmatmul_precision: medium\n",
      "\tprefill_mode: AsPrefillDefault\n",
      "\tcache_mode: AsCacheDefault\n",
      "\tengine_max_length = 2048\n",
      "\tengine_max_batch = 8\n",
      "\n",
      "I20240513 10:46:39.666332   732 as_engine.cpp:388] Load model from : ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.dimodel\n",
      "I20240513 10:46:39.666359   732 as_engine.cpp:303] SetDeviceIds: DeviceIDs.size() 1\n",
      "I20240513 10:46:39.666363   732 as_engine.cpp:310] Start create 1 Device: CPU workers.\n",
      "I20240513 10:46:39.666476   988 cpu_context.cpp:114] CPUContext::InitMCCL() rank: 0 nRanks: 1\n",
      "I20240513 10:46:39.719002   732 as_param_check.hpp:342] AsParamGuard check level = CHECKER_NORMAL. Engine version = 1.0 . Weight version = 1.0 . \n",
      "I20240513 10:46:39.720805   732 as_engine.cpp:511] Start BuildModel\n",
      "I20240513 10:46:39.720876   990 as_engine.cpp:521] Start Build model for rank: 0\n",
      "I20240513 10:46:39.720901   990 weight_manager.cpp:131] Start Loading weight for model RankInfo[0/1]\n",
      "I20240513 10:46:39.720912   990 weight_manager.cpp:52] Start open model file ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.ditensors\n",
      "I20240513 10:46:39.720919   990 weight_manager.cpp:59] Open model file "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "build model over, build time is 3.6590442657470703\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "success. \n",
      "I20240513 10:46:39.721506   990 weight_manager.cpp:107] Weight file header parse success...195 weight tensors are going to load. \n",
      "I20240513 10:46:43.313182   990 weight_manager.cpp:257] finish weight load for model RankInfo[0/1] time  spend: 3.592 seconds.\n",
      "I20240513 10:46:43.314674   990 as_engine.cpp:525] Finish Build model for rank: 0\n",
      "I20240513 10:46:43.315203   732 as_engine.cpp:680] StartModel: warming up...\n",
      "I20240513 10:46:43.315230   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 \t Prompt: 0 T/s  Gen: 0 T/s \n",
      "I20240513 10:47:19.548533   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000000\n",
      "I20240513 10:47:19.847383   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000000\n",
      "I20240513 10:47:19.847442   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000000\n",
      "I20240513 10:47:19.847461   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 \t Prompt: 56.0053 T/s  Gen: 0.0547462 T/s \n",
      "I20240513 10:47:19.847579   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 \t Prompt: 0 T/s  Gen: 0 T/s \n",
      "I20240513 10:47:19.848335   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 \t Prompt: 0 T/s  Gen: 0 T/s \n",
      "I20240513 10:47:20.342268   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000001\n",
      "I20240513 10:47:20.549539   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 1 Pending: 0 \t Prompt: 7.13065 T/s  Gen: 2.85226 T/s \n",
      "I20240513 10:47:21.027010   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000002\n",
      "I20240513 10:47:21.367076   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 2 Pending: 0 \t Prompt: 6.11594 T/s  Gen: 3.66956 T/s \n",
      "I20240513 10:47:21.835250   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000003\n",
      "I20240513 10:47:22.279841   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 3 Pending: 0 \t Prompt: 5.47786 T/s  Gen: 4.38229 T/s \n",
      "I20240513 10:47:22.662540   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000004\n",
      "I20240513 10:47:23.233779   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 4 Pending: 0 \t Prompt: 5.24144 T/s  Gen: 5.24144 T/s \n",
      "I20240513 10:47:23.651983   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000005\n",
      "I20240513 10:47:24.298223   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 5 Pending: 0 \t Prompt: 4.69729 T/s  Gen: 5.63675 T/s \n",
      "I20240513 10:47:24.753854   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000006\n",
      "I20240513 10:47:25.949201   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000007\n",
      "I20240513 10:47:27.156082   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000008\n",
      "I20240513 10:47:28.743551   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 8 Pending: 0 \t Prompt: 3.37433 T/s  Gen: 7.19857 T/s \n",
      "I20240513 10:47:31.002952   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000001\n",
      "I20240513 10:47:31.026002   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000001\n",
      "I20240513 10:47:31.785954   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000002\n",
      "I20240513 10:47:31.788044   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000002\n",
      "I20240513 10:47:32.487955   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000003\n",
      "I20240513 10:47:32.490468   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000003\n",
      "I20240513 10:47:33.131556   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000004\n",
      "I20240513 10:47:33.134057   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000004\n",
      "I20240513 10:47:33.632434   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000005\n",
      "I20240513 10:47:33.634886   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000005\n",
      "I20240513 10:47:33.634903   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 3 Pending: 0 \t Prompt: 0 T/s  Gen: 9.40434 T/s \n",
      "I20240513 10:47:34.078444   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000006\n",
      "I20240513 10:47:34.081004   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000006\n",
      "I20240513 10:47:34.434237   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000007\n",
      "I20240513 10:47:34.436771   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000007\n",
      "I20240513 10:47:34.647670   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000008\n",
      "I20240513 10:47:34.647722   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000008\n"
     ]
    }
   ],
   "source": [
    "from IPython.display import display, clear_output\n",
    "\n",
    "from dashinfer.helper import EngineHelper\n",
    "\n",
    "# model-specific configuration\n",
    "config = {\n",
    "  \"model_name\": \"Qwen-1_8B-Chat\",\n",
    "  \"model_type\": \"Qwen_v10\",\n",
    "  \"model_path\": \"./dashinfer_models/\",\n",
    "  \"generation_config\": {\n",
    "    \"temperature\": 1.0,\n",
    "    \"early_stopping\": True,\n",
    "    \"top_k\": 1024,\n",
    "    \"top_p\": 0.8,\n",
    "    \"repetition_penalty\": 1.1,\n",
    "    \"presence_penalty\": 0.0,\n",
    "    \"max_length\": 2048,\n",
    "    \"eos_token_id\": 151643,\n",
    "    \"stop_words_ids\": [[151643], [151644], [151645]]\n",
    "  }\n",
    "}\n",
    "\n",
    "# init EngineHelper\n",
    "engine_helper = EngineHelper(config)\n",
    "engine_helper.verbose = True\n",
    "engine_helper.init_tokenizer(torch_model_dir)\n",
    "\n",
    "# convert model to DashInfer format\n",
    "if not engine_helper.check_model_exist():\n",
    "    engine_helper.convert_model(torch_model_dir)\n",
    "\n",
    "# init engine\n",
    "engine_helper.init_engine()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84d3eeb5-42c8-4d0f-a039-ad9adca81b36",
   "metadata": {
    "id": "84d3eeb5-42c8-4d0f-a039-ad9adca81b36",
    "tags": []
   },
   "source": [
    "# 4. Model-Inference via DashInfer\n",
    "\n",
    "- Inference information: available in log, such as `Prompt: ?? T/s`,  `Gen: ?? T/s`(genration speed), and `Running: ??`(concurrent running requests).\n",
    "\n",
    "- **Exit Command:**  Type `exit` to exit.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fd80ff70-ab8b-4c7d-bf9e-d6f968d0ed0e",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2024-05-13T02:47:34.651669Z",
     "iopub.status.busy": "2024-05-13T02:47:34.651446Z",
     "iopub.status.idle": "2024-05-13T02:48:36.459394Z",
     "shell.execute_reply": "2024-05-13T02:48:36.458886Z",
     "shell.execute_reply.started": "2024-05-13T02:47:34.651653Z"
    },
    "id": "fd80ff70-ab8b-4c7d-bf9e-d6f968d0ed0e",
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input: who are you\n",
      "Response:\n",
      "I am QianWen, a pre-trained language model developed by Alibaba Cloud. I was trained on a massive amount of text data to perform various tasks such as answering questions, generating text, and translating languages. How can I assist you today?\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type in your prompt:  exit\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Exiting the program.\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    while True:\n",
    "        input_value = input(\"Type in your prompt: \")\n",
    "        if input_value.lower() == 'exit':\n",
    "            print(\"Exiting the program.\")\n",
    "            break\n",
    "\n",
    "        prompt_list, gen_cfg_list = apply_chatml_template(\n",
    "            input_value, engine_helper.default_gen_cfg)\n",
    "        request_list = engine_helper.create_request(prompt_list, gen_cfg_list)\n",
    "        request = request_list[0]\n",
    "\n",
    "        gen = engine_helper.process_one_request_stream(request)\n",
    "        for part in gen:\n",
    "            clear_output(wait=True)\n",
    "            print(f\"Input: {input_value}\")\n",
    "            print(f\"Response:\\n{part}\")\n",
    "            sys.stdout.flush()\n",
    "        print()\n",
    "        time.sleep(1)\n",
    "\n",
    "except KeyboardInterrupt:\n",
    "    sys.stdout.write(\"\\nProgram interrupted. Exiting...\\n\")\n",
    "    sys.exit()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6dad5f2b-9b99-4e8a-8e18-a1181c58e8c5",
   "metadata": {
    "id": "6dad5f2b-9b99-4e8a-8e18-a1181c58e8c5",
    "tags": []
   },
   "source": [
    "# 5. Engine Un-initialization\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "da989d64-2942-4cd8-924b-8c9f1ae7d12e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-05-13T02:48:36.460224Z",
     "iopub.status.busy": "2024-05-13T02:48:36.459963Z",
     "iopub.status.idle": "2024-05-13T02:48:36.463163Z",
     "shell.execute_reply": "2024-05-13T02:48:36.462631Z",
     "shell.execute_reply.started": "2024-05-13T02:48:36.460203Z"
    },
    "id": "da989d64-2942-4cd8-924b-8c9f1ae7d12e",
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I20240513 10:48:36.460870   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 \t Prompt: 0 T/s  Gen: 0.249401 T/s \n",
      "I20240513 10:48:36.460969   732 as_engine.cpp:859] [Qwen-1_8B-Chat_cpu_single_float32] waiting to join loop thread\n",
      "I20240513 10:48:36.460994   732 as_engine.cpp:862] [Qwen-1_8B-Chat_cpu_single_float32] loop thread joined\n"
     ]
    }
   ],
   "source": [
    "# release all resources allocated to DashInfer engine\n",
    "engine_helper.uninit_engine()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
